In [7]:
library(MASS)
In [8]:
# install.packages('/notebooks/ISLR_1.0.tar.gz')
In [9]:
library(ISLR) # library from the book
In [10]:
names(Boston) # housing values in suburbs of Boston; part of the MASS package
Out[10]:
In [11]:
# open the help page for the Boston data set (column descriptions)
?Boston
Out[11]:
In [12]:
# lstat: lower status of the population (percent)
# medv: median value of owner-occupied homes in $1000s
# side-by-side panels: 1 row, 2 columns
par(mfrow = c(1,2))
plot(Boston$lstat, Boston$medv)
# different notation, same result: plots Y-axis~X-axis from Boston dataframe
plot(medv~lstat, Boston)
In [13]:
# Simple linear regression: the response medv is modelled on the
# single predictor lstat from the Boston data frame.
fit1 <- lm(medv ~ lstat, data = Boston)
In [14]:
fit1 # printing the fit shows a negative slope for lstat
Out[14]:
In [15]:
# as the % of 'lower status' inhabitants increases,
# the median housing value decreases
summary(fit1) # coefficient table: lstat is highly significant here
Out[15]:
In [16]:
plot(medv~lstat, Boston)
# overlay the fitted regression line; abline adds straight lines to a plot
abline(fit1, col="red")
In [17]:
names(fit1) # components stored in the lm object (coefficients, residuals, ...)
Out[17]:
In [18]:
# 95% confidence intervals for the coefficients of the fit
confint(fit1)
Out[18]:
In [19]:
# predict() can be used to query a model fit
# here: predict medv (and conf intervals) given 3 new values for lstat
predict(fit1, data.frame(lstat=c(5,10,15)), interval="confidence")
# returns the fit at the 3 vals, their lower/upper confidence intervals
Out[19]:
In [20]:
# plot(fit1) # gives lots of plots / stat summaries
In [21]:
# Two-predictor regression: medv modelled jointly on lstat and age.
fit2 <- lm(medv ~ lstat + age, data = Boston)
summary(fit2)
# age is also a significant predictor for medv, but weaker than lstat;
# multiple R-squared = share of variance explained (higher is better)
Out[21]:
In [22]:
# Regress medv on every other column of Boston ('.' = all remaining vars).
fit3 <- lm(medv ~ ., data = Boston)
summary(fit3)
# age was significant when paired with lstat alone, but with all the
# other predictors in the model it is no longer significant
Out[22]:
In [23]:
par(mfrow = c(2,2)) # 2x2 grid for the four diagnostic plots
plot(fit3)
# residuals vs. fitted values: we're looking for non-linearity
# we see here, that the model doesn't capture everything that is
# going on
In [24]:
# update() refits an existing model with a modified formula:
# same response, all predictors of fit3 except age and indus --
# i.e. we keep only the clearly significant terms.
fit4 <- update(fit3, ~ . - age - indus)
In [25]:
par(mfrow = c(2,2)) # 2x2 grid of diagnostic plots for the reduced model
plot(fit4)
In [26]:
summary(fit4)
Out[26]:
In [27]:
# linear model with an interaction between lstat and age;
# in formula syntax 'lstat*age' expands to lstat + age + lstat:age
fit5 = lm(medv~lstat*age, Boston)
# there's a somewhat significant interaction betw. lstat and age
# (although age in itself is not significant)
In [28]:
summary(fit5)
Out[28]:
In [29]:
# Quadratic term: '^' has a special meaning inside a model formula,
# so wrap lstat^2 in the identity function I() to get literal squaring.
fit6 <- lm(medv ~ lstat + I(lstat^2), Boston)
In [30]:
summary(fit6)
Out[30]:
In [31]:
# NOTE(review): attach() is generally discouraged (it can mask other
# objects on the search path); it makes the Boston columns visible
# so lstat/medv below resolve without the Boston$ prefix
attach(Boston) # put Boston vars in local namespace
In [32]:
plot(medv~lstat)
# add quadratic fit to plot,
# i.e. for each val from lstat print the fitted val of the model.
# we can't use abline(), b/c this isn't linear
points(lstat, fitted(fit6), col="red", pch=20)
In [33]:
plot(medv~lstat)
# fit medv as a polynomial of degree 4 in lstat
# (lstat/medv resolve via the attach(Boston) above)
fit7 = lm(medv~poly(lstat, 4))
points(lstat, fitted(fit7), col="blue", pch=20)
# higher-degree polynomials wiggle more: risk of overfitting the data
In [34]:
# demo of the 20 standard plotting symbols (pch values 1..20)
# cex: how much should the plotting symbol be magnified
# e.g. cex.axis=2 increases axis labels
plot(1:20, 1:20, pch=1:20, cex=2)
In [40]:
# ?Carseats # simul. data set containing sales of child car seats at 400 different stores.
names(Carseats) # Carseats comes from the ISLR package loaded above
Out[40]:
In [38]:
summary(Carseats)
Out[38]:
In [44]:
# use all predictors from the frame (except for Sales) to predict Sales
# add interactions betw. Income and Advertising, as well as Age and Price
# NOTE: this reassigns fit1, shadowing the earlier Boston fit of that name
fit1 = lm(Sales~.+Income:Advertising+Age:Price, Carseats)
In [46]:
summary(fit1)
# signif. interaction betw. Income and Advertising
# Age:Price interaction isn't signif.
Out[46]:
In [50]:
#?contrasts # Set and view the contrasts associated with a factor.
In [51]:
contrasts(Carseats$ShelveLoc)
# shows how R will encode that variable in a linear model
# ShelveLoc: a 3-level factor, encoded as dummy (indicator) columns
Out[51]:
In [106]:
# Scatter-plot y against x, overlay the least-squares line in red,
# and return the summary of the fitted simple regression.
regplot <- function(x, y) {
  model <- lm(y ~ x)
  plot(x, y)
  abline(model, col = "red")
  summary(model)
}
In [105]:
regplot(Carseats$Price, Carseats$Sales)
Out[105]:
In [109]:
# ... is forwarded to plot(), like kwargs in Python.
# Scatter-plot y vs x, overlay the least-squares line in red, print the
# model summary, and return the fitted lm object for further use.
# Fix: the original computed summary(fit) mid-body and discarded it --
# inside a function the result is not auto-printed, so the summary never
# appeared; print() it explicitly.
regplot <- function(x, y, ...) {
  fit <- lm(y ~ x)
  plot(x, y, ...)
  abline(fit, col = "red")
  print(summary(fit))  # was silently discarded before
  return(fit)
}
In [108]:
regplot(Carseats$Price, Carseats$Sales, xlab="Price", ylab="Sales", col="blue", pch=20)
Out[108]:
In [93]:
# Note: this regresses Price on Sales -- the reverse of regplot() above,
# which fits y ~ x, i.e. Sales ~ Price.
carfit <- lm(Price ~ Sales, data = Carseats)
summary(carfit)
Out[93]:
In [91]:
# regplot() returns the lm object; $call stores the expression used to fit it
testfit = regplot(Carseats$Price, Carseats$Sales)
testfit$call
Out[91]:
In [95]:
colnames(Carseats)# list all column names (cf. Carseats$Price)
Out[95]:
In [97]:
# Look up the position of a column by name, then use that index to
# recover the name itself (column names are unique, so match() and
# which(colnames(...) == x) give the same single index here).
x <- "Price"
index <- match(x, colnames(Carseats))
colnames(Carseats)[index]
Out[97]:
In [89]:
# Fit Sepal.Length on Sepal.Width using the built-in iris data frame
# and extract the slope coefficient.
data(iris)
class(iris)
irisfit <- lm(Sepal.Length ~ Sepal.Width, data = iris)
#summary(irisfit)
# coef() is the standard accessor for object$coefficients
coef(irisfit)[2]
Out[89]:
Out[89]: